library(forecast)
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(readr)
TRANSIT <- read_csv("TRANSIT.csv") 
## Rows: 283 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl  (1): TRANSIT
## date (1): DATE
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Transit_Raw <- TRANSIT$TRANSIT
# Plot and Inference

Transit_ts <- ts(Transit_Raw, frequency = 12, start = c(2000,1)) 
plot(Transit_ts)

# The series displays consistent seasonal variation and stable ridership before 2020, followed by a possibly sharp COVID-related collapse and a slow, and recovery afterward.
# Central Tendency

summary(Transit_ts)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  171450  772502  824766  779435  866164  993437
boxplot(Transit_ts)

# The summary statistics indicate that transit ridership was generally high and stable, with most monthly values clustered between roughly 1st quartile and 3rd quartile, and a median of 824,766. The mean is slightly lower than the median, suggesting a slight left-skew influenced by lower values.
# The box plot shows generally stable pre-2020 ridership levels with a tight central range, alongside many low-value outliers caused by the COVID-related collapse in transit usage.
# Decomposition

decomp <- stl(Transit_ts,s.window ="periodic")
plot(decomp)

attributes(decomp)
## $names
## [1] "time.series" "weights"     "call"        "win"         "deg"        
## [6] "jump"        "inner"       "outer"      
## 
## $class
## [1] "stl"
# The decomposition clearly shows repeating cycles each year, with peaks and troughs occurring at roughly the same months.That means the series is seasonal, exhibiting a strong annual pattern.
# The decomposition is additive, because the seasonal component has a constant magnitude over time.

# Monthly Indices
monthly_indices <- seasonally_adjusted <- decomp$time.series[, "seasonal"]
monthly_indices[1:12]
##  [1] -28344.4773 -44993.0331  32372.4450  -3295.0362  15894.3269 -10193.2025
##  [7] -21748.6764   -220.9989  22783.8112  70049.5800  -2440.5805 -29864.1587
# The series shows its peak in October and its lowest level in February.
# Ridership is likely higher in October due to stable commuting patterns, mild weather, and active work and school schedules. In contrast, the cold winter conditions and reduced travel in February contribute to lower ridership levels.

# Seasonality Adjustment
Transit_adj <- seasadj(decomp)
plot(Transit_ts)
lines(Transit_adj, col="Red")

# Seasonality has a noticeable impact on the series, with monthly ridership fluctuating by roughly 30,000 to 70,000 riders depending on the time of year.
# Naive Method

naive_model <- naive(Transit_ts)
plot(naive_model$residuals)

# The residuals are mostly centered around zero with no strong recurring pattern, suggesting the model fits the data reasonably well under normal conditions. However, the large spike in residuals around 2020 indicates that the model fails to capture the abrupt COVID-related collapse in ridership.

hist(naive_model$residuals)

# The residuals are mostly clustered around zero, but the histogram is left-skewed due to several very large negative residuals from the COVID-19 collapse. This indicates that while the model fits normal periods reasonably well, it does not handle the sudden structural break in 2020.


# fitted values vs residuals
plot(naive_model$fitted, naive_model$residuals)
abline(h = 0, col = "red")

# The residuals are mostly centered around zero, but the presence of several large negative outliers indicates that the model fails to account for the abrupt COVID-related decline. This suggests the model fits typical periods reasonably well but does not capture structural shocks.

# actual values vs residuals
plot(Transit_ts, naive_model$residuals)
abline(h = 0, col = "red")

# The plot shows that most residuals cluster around zero, indicating that the model fits the majority of the data well. However, there are several large negative residuals when actual ridership is very low, reflecting the pandemic collapse that the model was unable to predict.

# ACF of Residuals
Acf(naive_model$residuals)

# The ACF plot shows significant autocorrelation at multiple lags, especially at lag 12, indicating that the residuals are not random. This means the naive model does not adequately capture the seasonal structure of the time series.

# Accuracy
accuracy(naive_model)
##                     ME    RMSE      MAE        MPE     MAPE      MASE
## Training set -683.4326 55803.6 41483.85 -0.6755225 6.063339 0.7298063
##                    ACF1
## Training set -0.1325041
#Forecast
naive_forecast <- naive(Transit_ts,12)
plot(naive_forecast)

summary(naive_forecast)
## 
## Forecast method: Naive method
## 
## Model Information:
## Call: naive(y = Transit_ts, h = 12) 
## 
## Residual sd: 55803.6048 
## 
## Error measures:
##                     ME    RMSE      MAE        MPE     MAPE      MASE
## Training set -683.4326 55803.6 41483.85 -0.6755225 6.063339 0.7298063
##                    ACF1
## Training set -0.1325041
## 
## Forecasts:
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Aug 2023         532206 460690.8 603721.2 422832.9 641579.1
## Sep 2023         532206 431068.2 633343.8 377529.1 686882.9
## Oct 2023         532206 408338.0 656074.0 342766.3 721645.7
## Nov 2023         532206 389175.6 675236.4 313459.9 750952.1
## Dec 2023         532206 372293.2 692118.8 287640.4 776771.6
## Jan 2024         532206 357030.3 707381.7 264297.8 800114.2
## Feb 2024         532206 342994.6 721417.4 242832.1 821579.9
## Mar 2024         532206 329930.5 734481.5 222852.3 841559.7
## Apr 2024         532206 317660.4 746751.6 204086.8 860325.2
## May 2024         532206 306055.1 758356.9 186338.0 878074.0
## Jun 2024         532206 295016.9 769395.1 169456.6 894955.4
## Jul 2024         532206 284470.1 779941.9 153326.6 911085.4
# The naive forecasting model has moderate accuracy, with a MAPE of about 6%, meaning its predictions are generally close during normal periods but do not capture sudden changes like COVID. Naive model is simple and works for stable patterns, but it does not account for trend or seasonality.
# Simple Moving Averages

attributes(Transit_ts)
## $tsp
## [1] 2000.0 2023.5   12.0
## 
## $class
## [1] "ts"
frequency(Transit_ts)
## [1] 12
start(Transit_ts)
## [1] 2000    1
end(Transit_ts)
## [1] 2023    7
plot(Transit_ts)

MA3 <- ma(Transit_ts,order=3)
plot(MA3)

MA6 <- ma(Transit_ts,order=6)
plot(MA6)

MA9 <- ma(Transit_ts,order=9)
plot(MA9)

plot(Transit_ts)
lines(Transit_ts)
lines(MA3, col = "red", lwd = 2)
lines(MA6, col = "blue", lwd = 2)
lines(MA9, col = "green", lwd = 2)

# forecast for the next 12 month
MA3_forecast <- forecast(MA3, h = 12)
## Warning in ets(object, lambda = lambda, biasadj = biasadj,
## allow.multiplicative.trend = allow.multiplicative.trend, : Missing values
## encountered. Using longest contiguous portion of time series
MA6_forecast <- forecast(MA6, h = 12)
## Warning in ets(object, lambda = lambda, biasadj = biasadj,
## allow.multiplicative.trend = allow.multiplicative.trend, : Missing values
## encountered. Using longest contiguous portion of time series
MA9_forecast <- forecast(MA9, h = 12)
## Warning in ets(object, lambda = lambda, biasadj = biasadj,
## allow.multiplicative.trend = allow.multiplicative.trend, : Missing values
## encountered. Using longest contiguous portion of time series
summary(MA9_forecast)
## 
## Forecast method: ETS(A,Ad,N)
## 
## Model Information:
## ETS(A,Ad,N) 
## 
## Call:
## ets(y = object, lambda = lambda, biasadj = biasadj, allow.multiplicative.trend = allow.multiplicative.trend)
## 
##   Smoothing parameters:
##     alpha = 0.9999 
##     beta  = 0.9999 
##     phi   = 0.8 
## 
##   Initial states:
##     l = 790233.3054 
##     b = 2301.1594 
## 
##   sigma:  8771.733
## 
##      AIC     AICc      BIC 
## 6545.175 6545.489 6566.876 
## 
## Error measures:
##                     ME     RMSE      MAE        MPE      MAPE      MASE
## Training set -195.8352 8691.624 6867.204 0.00766116 0.9437753 0.1484077
##                    ACF1
## Training set 0.06643575
## 
## Forecasts:
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Apr 2023       548293.2 537051.7 559534.6 531100.9 565485.4
## May 2023       544672.6 521526.8 567818.3 509274.2 580070.9
## Jun 2023       541776.1 505888.4 577663.8 486890.5 596661.6
## Jul 2023       539458.9 490582.2 588335.6 464708.4 614209.4
## Aug 2023       537605.2 475825.9 599384.4 443122.0 632088.4
## Sep 2023       536122.2 461715.7 610528.7 422327.2 649917.1
## Oct 2023       534935.8 448281.0 621590.5 402408.7 667462.8
## Nov 2023       533986.6 435514.7 632458.6 383386.8 684586.5
## Dec 2023       533227.3 423389.4 643065.2 365244.8 701209.9
## Jan 2024       532619.9 411867.7 653372.1 347945.3 717294.5
## Feb 2024       532133.9 400907.4 663360.4 331440.3 732827.5
## Mar 2024       531745.2 390465.9 673024.4 315677.2 747813.1
accuracy(MA9_forecast)
##                     ME     RMSE      MAE        MPE      MAPE      MASE
## Training set -195.8352 8691.624 6867.204 0.00766116 0.9437753 0.1484077
##                    ACF1
## Training set 0.06643575
# As the moving average order increases from 3 to 9, the plot becomes smoother and less noisy, but also lags behind the actual data, so it better for long-term trend identification but less effective for short-term forecasting.
# Simple Smoothing

SSE_forecast <- ses(Transit_ts,h=12)
SSE_forecast
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Aug 2023       539578.3 468678.2 610478.5 431145.9 648010.7
## Sep 2023       539578.3 447103.9 632052.8 398150.9 681005.8
## Oct 2023       539578.3 429686.5 649470.1 371513.3 707643.4
## Nov 2023       539578.3 414674.8 664481.9 348554.8 730601.9
## Dec 2023       539578.3 401283.0 677873.6 328073.9 751082.8
## Jan 2024       539578.3 389078.2 690078.4 309408.3 769748.4
## Feb 2024       539578.3 377791.5 701365.1 292146.7 787009.9
## Mar 2024       539578.3 367242.4 711914.2 276013.3 803143.4
## Apr 2024       539578.3 357302.8 721853.8 260812.0 818344.6
## May 2024       539578.3 347877.9 731278.7 246397.9 832758.8
## Jun 2024       539578.3 338895.2 740261.5 232659.9 846496.8
## Jul 2024       539578.3 330297.6 748859.1 219511.1 859645.6
summary(SSE_forecast)
## 
## Forecast method: Simple exponential smoothing
## 
## Model Information:
## Simple exponential smoothing 
## 
## Call:
## ses(y = Transit_ts, h = 12)
## 
##   Smoothing parameters:
##     alpha = 0.8374 
## 
##   Initial states:
##     l = 732001.6666 
## 
##   sigma:  55323.68
## 
##      AIC     AICc      BIC 
## 7782.916 7783.002 7793.852 
## 
## Error measures:
##                     ME     RMSE      MAE        MPE     MAPE      MASE
## Training set -812.0035 55127.84 40313.36 -0.8363806 6.026904 0.7092143
##                    ACF1
## Training set 0.01607982
## 
## Forecasts:
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Aug 2023       539578.3 468678.2 610478.5 431145.9 648010.7
## Sep 2023       539578.3 447103.9 632052.8 398150.9 681005.8
## Oct 2023       539578.3 429686.5 649470.1 371513.3 707643.4
## Nov 2023       539578.3 414674.8 664481.9 348554.8 730601.9
## Dec 2023       539578.3 401283.0 677873.6 328073.9 751082.8
## Jan 2024       539578.3 389078.2 690078.4 309408.3 769748.4
## Feb 2024       539578.3 377791.5 701365.1 292146.7 787009.9
## Mar 2024       539578.3 367242.4 711914.2 276013.3 803143.4
## Apr 2024       539578.3 357302.8 721853.8 260812.0 818344.6
## May 2024       539578.3 347877.9 731278.7 246397.9 832758.8
## Jun 2024       539578.3 338895.2 740261.5 232659.9 846496.8
## Jul 2024       539578.3 330297.6 748859.1 219511.1 859645.6
# As the showing result, the smoothing parameter alpha is approximately 0.8374, meaning the model assigns 84% weight to the most recent observation, resulting in a model that responds quickly to recent changes. The initial level 732,001.67 represents the starting smoothed estimate of the series. The sigma value of approximately 55,324 reflects the variability of the residuals, suggesting moderate forecast error but generally stable performance.

# Residual Analysis
plot(SSE_forecast$residuals)

# The residual plot shows no consistent trend or repeating pattern, indicating that the SES model has captured the overall level of the series fairly well. However, there is a noticeable large negative residual around 2020, which reflects the sudden drop in ridership during pandemic that the model could not anticipate.

hist(SSE_forecast$residuals)

# The histogram shows most residuals centered near zero, but a long left tail caused by the COVID-related drop indicates the model could not capture the sudden structural change.

# fitted values vs residuals
plot(SSE_forecast$fitted, SSE_forecast$residuals)
abline(h = 0, col = "red")

# The fitted-vs-residuals plot for the SES model shows mostly random scatter around zero, indicating a better fit than the naive and moving average models, though large negative residuals during the COVID drop remain unaccounted for.

# actual values vs residuals
plot(Transit_ts, SSE_forecast$residuals)
abline(h = 0, col = "red")

# The plot shows that most residuals are centered around zero, indicating a reasonable fit, but the large negative residuals when actual values are low show that the model still fails to capture the sudden structural change in the series.

# ACF of Residuals
Acf(SSE_forecast$residuals)

# The ACF plot of the SES residuals shows fewer significant autocorrelations than the naive and moving average models, indicating that SES captures the underlying level better; however, the spike around lag 12 shows remaining seasonality that the model does not fully remove.

# Accuracy
accuracy(SSE_forecast)
##                     ME     RMSE      MAE        MPE     MAPE      MASE
## Training set -812.0035 55127.84 40313.36 -0.8363806 6.026904 0.7092143
##                    ACF1
## Training set 0.01607982
# The SES model shows better accuracy than the naive and moving average models (lower RMSE and MAE), meaning it fits the data more closely. It forecasts that ridership in one year will stay close to the current level, since it weights recent data more heavily. However, it still does not fully capture seasonality or the sharp pandemic drop.
# Holt-Winters

HW <- HoltWinters(Transit_ts)
plot(HW)

HW_forecast <- forecast(Transit_ts,h=12)
plot(HW_forecast)

summary(HW_forecast)
## 
## Forecast method: ETS(A,N,A)
## 
## Model Information:
## ETS(A,N,A) 
## 
## Call:
## ets(y = object, lambda = lambda, biasadj = biasadj, allow.multiplicative.trend = allow.multiplicative.trend)
## 
##   Smoothing parameters:
##     alpha = 0.9999 
##     gamma = 1e-04 
## 
##   Initial states:
##     l = 805081.3493 
##     s = -29928.9 -2604.664 70008.64 22793.36 -189.8851 -20385.34
##            -10941.57 16412.36 -1776.894 31546.4 -45757.8 -29175.71
## 
##   sigma:  41374.59
## 
##      AIC     AICc      BIC 
## 7630.122 7631.920 7684.804 
## 
## Error measures:
##                     ME     RMSE      MAE        MPE     MAPE      MASE
## Training set -892.2577 40338.21 25687.09 -0.5431477 4.095672 0.4519012
##                     ACF1
## Training set 0.009813671
## 
## Forecasts:
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Aug 2023       552406.3 499382.7 605430.0 471313.6 633499.0
## Sep 2023       575389.4 500406.4 650372.4 460712.8 690065.9
## Oct 2023       622604.8 530771.4 714438.3 482157.7 763052.0
## Nov 2023       549991.7 443952.5 656031.0 387818.7 712164.8
## Dec 2023       522667.4 404112.5 641222.3 341353.3 703981.5
## Jan 2024       523420.1 393550.2 653290.1 324801.2 722039.1
## Feb 2024       506838.9 366563.7 647114.1 292306.4 721371.3
## Mar 2024       584143.3 434183.1 734103.6 354798.9 813487.7
## Apr 2024       550814.2 391757.5 709870.8 307558.0 794070.3
## May 2024       569011.4 401351.2 736671.7 312597.2 825425.6
## Jun 2024       541658.0 365814.6 717501.3 272728.7 810587.2
## Jul 2024       532206.0 348542.2 715869.8 251316.5 813095.6
# The Holt-Winters model estimated alpha = 0.9999, meaning it gives almost full weight to the most recent data and adjusts the level very quickly. There is no beta term because the model includes no trend component. Gamma = 0.0001 indicates that the seasonal pattern is very stable over time. The initial level (= 805,081) represents the baseline ridership, and the seasonal values show which months are above or below average. Sigma = 41,375 indicates the typical size of forecast errors, reflecting a relatively good model fit compared to simpler methods.


# Residual Analysis
plot(HW_forecast$residuals)

# Residuals are mostly random around zero, showing Holt-Winters fits better than naïve, moving average, and SES, except during the pandemic drop.

hist(HW_forecast$residuals)

# The histogram is centered near zero with a smaller spread, indicating improved accuracy, though a left tail remains due to pandemic.

# fitted values vs residuals
plot(HW_forecast$fitted, HW_forecast$residuals)
abline(h = 0, col = "red")

# Residuals show no pattern and are tighter than in the naive, moving average, and SES models, indicating a better fit.

# actual values vs residuals
plot(Transit_ts, HW_forecast$residuals)
abline(h = 0, col = "red")

# Most residuals are small, with Holt-Winters reducing error more effectively than the other models, except during the pandemic shock.

# ACF of Residuals
Acf(HW_forecast$residuals)

# The ACF plot of the SES residuals shows fewer significant autocorrelations than the naive and moving average models, indicating that SES captures the underlying level better; however, the spike around lag 12 shows remaining seasonality that the model does not fully remove.

# Accuracy
accuracy(HW_forecast)
##                     ME     RMSE      MAE        MPE     MAPE      MASE
## Training set -892.2577 40338.21 25687.09 -0.5431477 4.095672 0.4519012
##                     ACF1
## Training set 0.009813671
# The ACF of residuals shows little to no significant autocorrelation, indicating the model has successfully captured both level and seasonality. This is a clear improvement compared to naïve and moving average models, which showed strong autocorrelation, and a modest improvement over SES, which struggled with seasonal structure.

# Accuracy
accuracy(HW_forecast)
##                     ME     RMSE      MAE        MPE     MAPE      MASE
## Training set -892.2577 40338.21 25687.09 -0.5431477 4.095672 0.4519012
##                     ACF1
## Training set 0.009813671
# The Holt-Winters model shows the best accuracy among the naïve, moving average, and SES models, with the lowest RMSE and MAE, meaning it fits the data more closely. It predicts that ridership next year will remain near the current post-COVID levels, while still reflecting the seasonal ups and downs.
# Accuracy Summary

accuracy(naive_forecast)
##                     ME    RMSE      MAE        MPE     MAPE      MASE
## Training set -683.4326 55803.6 41483.85 -0.6755225 6.063339 0.7298063
##                    ACF1
## Training set -0.1325041
accuracy(MA3_forecast)
##                     ME     RMSE      MAE         MPE     MAPE      MASE
## Training set -474.3732 21447.51 9418.979 -0.06588614 1.651357 0.1843854
##                   ACF1
## Training set 0.5855302
accuracy(MA6_forecast)
##                     ME     RMSE      MAE        MPE     MAPE       MASE
## Training set -90.14435 7087.279 4032.781 0.05631012 0.631004 0.08383187
##                   ACF1
## Training set 0.4079666
accuracy(MA9_forecast)
##                     ME     RMSE      MAE        MPE      MAPE      MASE
## Training set -195.8352 8691.624 6867.204 0.00766116 0.9437753 0.1484077
##                    ACF1
## Training set 0.06643575
accuracy(SSE_forecast)
##                     ME     RMSE      MAE        MPE     MAPE      MASE
## Training set -812.0035 55127.84 40313.36 -0.8363806 6.026904 0.7092143
##                    ACF1
## Training set 0.01607982
accuracy(HW_forecast)
##                     ME     RMSE      MAE        MPE     MAPE      MASE
## Training set -892.2577 40338.21 25687.09 -0.5431477 4.095672 0.4519012
##                     ACF1
## Training set 0.009813671
acc_Naive <- accuracy(naive_forecast)
acc_MA3 <- accuracy(MA3_forecast)
acc_MA6 <- accuracy(MA6_forecast)
acc_MA9 <- accuracy(MA9_forecast)
acc_SES <- accuracy(SSE_forecast)
acc_HW <- accuracy(HW_forecast)

accuracy_table <- rbind("Naive" = acc_Naive, "MA3" = acc_MA3, "MA6" = acc_MA6, "MA9" = acc_MA9, "SES" = acc_SES, "HW" = acc_HW)
accuracy_table <- rbind(
  "Naive" = acc_Naive["Training set", ],
  "MA3"   = acc_MA3["Training set", ],
  "MA6"   = acc_MA6["Training set", ],
  "MA9"   = acc_MA9["Training set", ],
  "SES"   = acc_SES["Training set", ],
  "HW"    = acc_HW["Training set", ]
)
round(accuracy_table, 4)
##              ME      RMSE       MAE     MPE   MAPE   MASE    ACF1
## Naive -683.4326 55803.605 41483.851 -0.6755 6.0633 0.7298 -0.1325
## MA3   -474.3732 21447.510  9418.979 -0.0659 1.6514 0.1844  0.5855
## MA6    -90.1444  7087.279  4032.781  0.0563 0.6310 0.0838  0.4080
## MA9   -195.8352  8691.624  6867.204  0.0077 0.9438 0.1484  0.0664
## SES   -812.0035 55127.840 40313.357 -0.8364 6.0269 0.7092  0.0161
## HW    -892.2577 40338.207 25687.094 -0.5431 4.0957 0.4519  0.0098
# I chose RMSE as the accuracy measure because it penalizes large forecast errors more heavily, which is important for this dataset since it contains a large shock. Based on RMSE, the best model is MA6 (RMSE = 7,087), and the worst model is SES (RMSE = 55,128) among the methods tested. MA6 performs best because its smoothing window balances noise reduction and responsiveness, while SES performs worst because it overweights recent observations and cannot handle the large structural change.
# Conclusion
# The transit ridership series shows stable and strong seasonal patterns before 2020, followed by a sharp drop during the COVID-19 period and a gradual recovery afterward. This indicates that while the long-term demand trend was steady, the pandemic created a temporary structural break in the series. Based on the best-performing forecast model, ridership is expected to continue increasing slowly over the next year as recovery progresses. Over the next two years, the series is likely to move closer to pre-pandemic levels, but may not fully return to the previous peak.